suppressPackageStartupMessages(library(diffcyt))
suppressPackageStartupMessages(library(rrcov))
suppressPackageStartupMessages(library(CATALYST))
suppressPackageStartupMessages(library(ExperimentHub)) 
suppressPackageStartupMessages(library(moveHMM))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(flowCore))
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(SingleCellExperiment))
suppressPackageStartupMessages(library(ExperimentHub))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(UpSetR))
suppressPackageStartupMessages(library(PCAtools))
suppressPackageStartupMessages(library(mclust))
suppressPackageStartupMessages(library(clue))

Question 1.

Grab a well-known dataset, the Zheng 10x PBMC pre-sorted dataset,

from ExperimentHub (see code below). Explore basic properties of this dataset,

including the number cells of each subpopulation (see the phenoid column of the colData),

the depth of sequencing by subpopulation and other aspects you can think of.

Re-investigate the filtering (some was already) by plotting

the percentage of mitochondrial reads versus the total number of reads.

If appropriate, additionally filter any outlier cells.

eh <- ExperimentHub()
## snapshotDate(): 2019-10-22
sce <- eh[["EH1532"]]
## snapshotDate(): 2019-10-22
## see ?DuoClustering2018 and browseVignettes('DuoClustering2018') for documentation
## loading from cache
rownames(sce) <- paste0(rowData(sce)$id, "_", rowData(sce)$symbol)
sce
## class: SingleCellExperiment 
## dim: 15716 3994 
## metadata(1): log.exprs.offset
## assays(3): counts logcounts normcounts
## rownames(15716): ENSG00000237683_AL627309.1
##   ENSG00000228327_RP11-206L10.2 ... ENSG00000215700_PNRC2
##   ENSG00000215699_SRSF10
## rowData names(10): id symbol ... total_counts log10_total_counts
## colnames(3994): b.cells1147 b.cells6276 ... naive.t1167 naive.t4926
## colData names(14): dataset barcode ... libsize.drop feature.drop
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
## altExpNames(0):

Investigating the data:

We can extract the subpopulations’ names through sce@colData:

populations=sce@colData@listData[["phenoid"]]
populations2=as.data.frame(sce@colData@listData[["phenoid"]])

Below we can see the number of cells in each of these populations:

b.cells, naive.cytotoxic,cd14.monocytes,cd4.t.helper,cd56.nk,memory.t,naive.t

sum(populations[1:3994]=="b.cells")
## [1] 499
sum(populations[1:3994]=="naive.cytotoxic")
## [1] 398
sum(populations[1:3994]=="cd14.monocytes")
## [1] 600
sum(populations[1:3994]=="cd4.t.helper")
## [1] 400
sum(populations[1:3994]=="cd56.nk")
## [1] 600
sum(populations[1:3994]=="memory.t")
## [1] 500
sum(populations[1:3994]=="naive.t")
## [1] 499
total_counts=sce@colData@listData[["total_counts"]]
bcell_depth=sum(total_counts[1:499])
naive_cytotoxic_depth=sum(total_counts[500:897])
cd14_monocytes_depth=sum(total_counts[898:1497])
cd14_t_helper_depth=sum(total_counts[1498:1897])
cd56nk_depth=sum(total_counts[1898:2497])
memory_t_depth=sum(total_counts[2498:2997])
naive_t_depth=sum(total_counts[2998:3969])
bcell_depth
## [1] 737752
naive_cytotoxic_depth
## [1] 617717
cd14_monocytes_depth
## [1] 644180
cd14_t_helper_depth
## [1] 528095
cd56nk_depth
## [1] 837739
memory_t_depth
## [1] 828500
naive_t_depth
## [1] 1393245

Plotting percentage of mitochondrial gene mapped vs. total counts:

(mito <- grep("MT-", rownames(sce), value = TRUE))
##  [1] "ENSG00000198888_MT-ND1"  "ENSG00000198763_MT-ND2" 
##  [3] "ENSG00000198804_MT-CO1"  "ENSG00000198712_MT-CO2" 
##  [5] "ENSG00000228253_MT-ATP8" "ENSG00000198899_MT-ATP6"
##  [7] "ENSG00000198938_MT-CO3"  "ENSG00000198840_MT-ND3" 
##  [9] "ENSG00000212907_MT-ND4L" "ENSG00000198886_MT-ND4" 
## [11] "ENSG00000198786_MT-ND5"  "ENSG00000198695_MT-ND6" 
## [13] "ENSG00000198727_MT-CYB"
df <- perCellQCMetrics(sce, subsets=list(Mito=mito))
df 
## DataFrame with 3994 rows and 10 columns
##            sum  detected   percent_top_50  percent_top_100  percent_top_200
##      <numeric> <integer>        <numeric>        <numeric>        <numeric>
## 1         1096       427 48.1751824817518 66.7883211678832 79.2883211678832
## 2          857       340 51.1085180863477 69.0781796966161 83.6639439906651
## 3         1593       602 46.2649089767734 63.0257376020088 74.7645951035782
## 4          743       410 42.7994616419919 57.8734858681023 71.7362045760431
## 5         1031       411 51.0184287099903 67.3132880698351 79.5344325897187
## ...        ...       ...              ...              ...              ...
## 3990      1259       545 43.6060365369341 59.0945194598888 72.5972994440032
## 3991      1259       504 46.6243050039714 62.6687847498014 75.8538522637014
## 3992      1290       614 41.0077519379845 55.0387596899225  67.906976744186
## 3993       805       365 49.9378881987578 64.9689440993789 79.5031055900621
## 3994      1224       498 46.6503267973856 63.1535947712418 75.6535947712418
##       percent_top_500 subsets_Mito_sum subsets_Mito_detected
##             <numeric>        <numeric>             <integer>
## 1                 100               21                     7
## 2                 100               21                     8
## 3    93.5969868173258               37                     9
## 4                 100               61                    10
## 5                 100               26                     9
## ...               ...              ...                   ...
## 3990 96.4257347100874               24                     8
## 3991 99.6822875297855               16                     7
## 3992 91.1627906976744               26                     9
## 3993              100                7                     4
## 3994              100               23                     9
##      subsets_Mito_percent     total
##                 <numeric> <numeric>
## 1        1.91605839416058      1096
## 2        2.45040840140023       857
## 3        2.32266164469554      1593
## 4        8.20995962314939       743
## 5        2.52182347235693      1031
## ...                   ...       ...
## 3990     1.90627482128674      1259
## 3991     1.27084988085782      1259
## 3992     2.01550387596899      1290
## 3993    0.869565217391304       805
## 3994     1.87908496732026      1224
discard.mito <- isOutlier(df$subsets_Mito_percent, type="higher")
summary(discard.mito)
##    Mode   FALSE    TRUE 
## logical    3908      86
plot(df$sum, df$subsets_Mito_percent, log="x",
     xlab="Total count", ylab='Mitochondrial %')
abline(h=attr(discard.mito, "thresholds")["higher"], col="red")

#### below we can see the distribution of number of counts/cell in all the cells of the dataset (we could also see a visual outlier):

plot(df$sum)

#### From this plot, we could remove some outliers, such as - cells containing above 10% of mitochondrial reads & containing less than 5800 counts:

qc.mito <- df$subsets_Mito_percent > 10
qc.lib <- df$sum > 5000
discard <- qc.lib | qc.mito

Displaying the sum of all the cells removed - that have mitochondrial genes below (10%) &

DataFrame(LibSize=sum(qc.lib), MitoProp=sum(qc.mito), Total=sum(discard))
## DataFrame with 1 row and 3 columns
##     LibSize  MitoProp     Total
##   <integer> <integer> <integer>
## 1        15        10        25

Keeping the columns we DON’T want to discard.

filtered <- sce[,!discard]
sce=filtered 

After filtering, we need to normalize our data as well:

library(scran)
## 
## Attaching package: 'scran'
## The following object is masked from 'package:PCAtools':
## 
##     parallelPCA
clusters <- quickCluster(sce)
sce <- computeSumFactors(sce, cluster=clusters) 
sce <- logNormCounts(sce)

Question 2.

Identify “features of interest”, which usually means highly variable genes.

There are various ways to do this (e.g., Seurat’s FindVariableFeatures or scran’s modelGeneVar).

Select features in at least two ways (say, 1000-2000 genes) and make an upset plot to compare the lists.

Quantifying per gene variation by variance of the log-counts:

Variance in the PBMC data set as a function of the mean.

Each point represents a gene while the red line represents the trend fitted to all genes.

suppressPackageStartupMessages(library(scran))
dec.pbmc <- modelGeneVar(sce)
fit.pbmc <- metadata(dec.pbmc)
plot(fit.pbmc$mean, fit.pbmc$var, xlab="Mean of log-expression",
     ylab="Variance of log-expression")
curve(fit.pbmc$trend(x), col="red", add=TRUE, lwd=2)

#### Ordering by most interesting genes for inspection.

dec.pbmc[order(dec.pbmc$bio, decreasing=TRUE),] 
## DataFrame with 15716 rows and 6 columns
##                                      mean             total              tech
##                                 <numeric>         <numeric>         <numeric>
## ENSG00000019582_CD74     1.26474331256936  2.74437092523629 0.787832656781869
## ENSG00000115523_GNLY     0.63952177892004  2.28385043090429 0.547202802756434
## ENSG00000163220_S100A9  0.498584133120091  1.67913170312111 0.452619436823626
## ENSG00000143546_S100A8  0.492033178926322  1.66864659827605 0.447777955785647
## ENSG00000204287_HLA-DRA 0.711610468263255  1.79553646646598 0.588479474393869
## ...                                   ...               ...               ...
## ENSG00000173812_EIF1     2.09963419307843 0.631699947538411 0.773803246617403
## ENSG00000161970_RPL26    2.86322949797314 0.536421674046619 0.693664303105048
## ENSG00000204525_HLA-C    2.38783488630792 0.560694785922417 0.752938297735789
## ENSG00000234745_HLA-B    2.99508368910097 0.425588062507862 0.669720843921196
## ENSG00000205542_TMSB4X   4.17218387314075 0.302824073065219 0.554934657032776
##                                        bio               p.value
##                                  <numeric>             <numeric>
## ENSG00000019582_CD74      1.95653826845442   7.4705166588826e-66
## ENSG00000115523_GNLY      1.73664762814786 3.70397836595812e-106
## ENSG00000163220_S100A9    1.22651226629748  5.41292371142383e-78
## ENSG00000143546_S100A8     1.2208686424904   6.2575549729389e-79
## ENSG00000204287_HLA-DRA   1.20705699207211  1.36957060561464e-45
## ...                                    ...                   ...
## ENSG00000173812_EIF1    -0.142103299078993     0.896967651074246
## ENSG00000161970_RPL26   -0.157242629058429     0.940716831884149
## ENSG00000204525_HLA-C   -0.192243511813372     0.960628027392819
## ENSG00000234745_HLA-B   -0.244132781413334     0.993962483886251
## ENSG00000205542_TMSB4X  -0.252110583967556     0.999120305859297
##                                           FDR
##                                     <numeric>
## ENSG00000019582_CD74     2.92041172487368e-62
## ENSG00000115523_GNLY    5.79191097084871e-102
## ENSG00000163220_S100A9   2.82139626918448e-74
## ENSG00000143546_S100A8   4.89246935559228e-75
## ENSG00000204287_HLA-DRA  3.56932925999935e-42
## ...                                       ...
## ENSG00000173812_EIF1        0.958772517591632
## ENSG00000161970_RPL26       0.987867858950806
## ENSG00000204525_HLA-C       0.999999305456306
## ENSG00000234745_HLA-B       0.999999305456306
## ENSG00000205542_TMSB4X      0.999999305456306

Quantifying per gene variation by Squared coefficient of variation:

dec.cv2.pbmc <- modelGeneCV2(sce)
fit.cv2.pbmc <- metadata(dec.cv2.pbmc)
plot(fit.cv2.pbmc$mean, fit.cv2.pbmc$cv2, log="xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 79 x values <= 0 omitted from
## logarithmic plot
curve(fit.cv2.pbmc$trend(x), col="red", add=TRUE, lwd=2)

#### The deviation for each gene from the trend:

dec.cv2.pbmc[order(dec.cv2.pbmc$ratio, decreasing=TRUE),]
## DataFrame with 15716 rows and 6 columns
##                                               mean            total
##                                          <numeric>        <numeric>
## ENSG00000254709_IGLL5              0.2712326359366 80.6430620109659
## ENSG00000143546_S100A8            2.01536951484566  11.425633804434
## ENSG00000163220_S100A9            1.98074332220598 10.1020978104105
## ENSG00000115523_GNLY              2.77958806920241 6.46259972848585
## ENSG00000115568_ZNF142         0.00778548688821122 1805.28752873894
## ...                                            ...              ...
## ENSG00000075702_WDR62                            0              NaN
## ENSG00000170956_CEACAM3                          0              NaN
## ENSG00000269403_CTD-2616J11.11                   0              NaN
## ENSG00000129991_TNNI3                            0              NaN
## ENSG00000223638_RFPL4A                           0              NaN
##                                            trend            ratio   p.value
##                                        <numeric>        <numeric> <numeric>
## ENSG00000254709_IGLL5           5.02761448544464 16.0400249948428         0
## ENSG00000143546_S100A8         0.791850753180504 14.4290243566005         0
## ENSG00000163220_S100A9         0.803365904447868 12.5747156488467         0
## ENSG00000115523_GNLY           0.610745834443516 10.5814880168184         0
## ENSG00000115568_ZNF142          170.647916723026 10.5790188559351         0
## ...                                          ...              ...       ...
## ENSG00000075702_WDR62                        Inf              NaN       NaN
## ENSG00000170956_CEACAM3                      Inf              NaN       NaN
## ENSG00000269403_CTD-2616J11.11               Inf              NaN       NaN
## ENSG00000129991_TNNI3                        Inf              NaN       NaN
## ENSG00000223638_RFPL4A                       Inf              NaN       NaN
##                                      FDR
##                                <numeric>
## ENSG00000254709_IGLL5                  0
## ENSG00000143546_S100A8                 0
## ENSG00000163220_S100A9                 0
## ENSG00000115523_GNLY                   0
## ENSG00000115568_ZNF142                 0
## ...                                  ...
## ENSG00000075702_WDR62                NaN
## ENSG00000170956_CEACAM3              NaN
## ENSG00000269403_CTD-2616J11.11       NaN
## ENSG00000129991_TNNI3                NaN
## ENSG00000223638_RFPL4A               NaN

Taking the top 1000 genes based on the variance of the log-counts:

hvg.pbmc.var <- getTopHVGs(dec.pbmc, n=2000)
str(hvg.pbmc.var)
##  chr [1:2000] "ENSG00000019582_CD74" "ENSG00000115523_GNLY" ...
way1=list(str(hvg.pbmc.var))
##  chr [1:2000] "ENSG00000019582_CD74" "ENSG00000115523_GNLY" ...

Taking top genes based on significance: our HVGs are defined as all genes that have adjusted p-values below 0.05.

hvg.pbmc.var.2 <- getTopHVGs(dec.pbmc, fdr.threshold = 0.05)
way2=list(hvg.pbmc.var.2)

Making an Upset plot comparisons of the two:

suppressPackageStartupMessages(library(UpSetR))
way1=as.data.frame(hvg.pbmc.var)
way2=as.data.frame(hvg.pbmc.var.2)

Question 3. Re-calculate the low dimensional projection using your preferred set of selected features

and produce some visualizations. For example, after re-running PCA, use the scater package to run the UMAP

algorithm. Make multiple plots of the UMAP coordinates according to cell type (this is known in advance for

this dataset), depth of sequencing and anything else you might find appropriate.

suppressPackageStartupMessages(library(scater))
sce=runPCA(sce,subset_row=hvg.pbmc.var)

percent.var <- attr(reducedDim(sce), "percentVar")
chosen.elbow <- PCAtools::findElbowPoint(percent.var)
chosen.elbow
## [1] 6
plot(percent.var, xlab="PC", ylab="Variance explained (%)")
abline(v=chosen.elbow, col="red")

reducedDim(sce, "PCA") <- reducedDim(sce, "PCA")[,1:10]
ncol(reducedDim(sce, "PCA"))
## [1] 10
reducedDim(sce, "PCA_10") <- reducedDim(sce, "PCA")[,1:10]
reducedDimNames(sce)
## [1] "PCA"    "TSNE"   "PCA_10"
plotReducedDim(sce, dimred="PCA",colour_by = "phenoid")

sce<- runUMAP(sce, dimred="PCA")
reducedDimNames(sce)
## [1] "PCA"    "TSNE"   "PCA_10" "UMAP"
dim(reducedDim(sce, "UMAP"))
## [1] 3969    2
plotReducedDim(sce, dimred="UMAP",colour_by = "phenoid")

plotReducedDim(sce, dimred="UMAP",colour_by = "log10_total_features")

plotReducedDim(sce, dimred="UMAP",colour_by = "log10_total_counts")

Question 4. Run at least 2 algorithms to cluster the data and make some comparisons.

One should be graph-based clustering as this seems to perform well, generally speaking.

Calculate the F1 score for each cell type (solve_LSAP in the clue package may be useful for matching

of clsuters to true populations) and the adjusted rand index (adjustedRandIndex in the mclust package,

for example) for an overall score. What cell types are more difficult to separate with clustering?

Run one of the algorithms at different numbers of clusters and plot a curve of the performance

(e.g., adjusted rand index) as a function of the number of clusters.

First of the 2 algorithms for clustering - the graph based clustering.

After the clustering, we assign this graph based clustering results back into the singlecellexperiment & make a t-SNE plot:

suppressPackageStartupMessages(library(scran))
g <- buildSNNGraph(sce, k=18, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7 
## 578 516 945 588 502 411 429
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
plotReducedDim(sce, "PCA", colour_by="cluster")

#### Second of the 2 algorithms for clustering - K-means custering:

sce1=sce
clust.kmeans <- kmeans(reducedDim(sce, "PCA"), centers=7)
table(clust.kmeans$cluster)
## 
##   1   2   3   4   5   6   7 
## 719 576 291 587 690 504 602
sce1$cluster <- factor(clust.kmeans$cluster)
plotReducedDim(sce1, "TSNE", colour_by="cluster") 

#### Calculating the adjusted Rand Index of the clusters generated by PCA & k-means:

adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
## [1] 0.7410227

Run one of the algorithms at different numbers of clusters and plot a curve of the performance

(e.g., adjusted rand index) as a function of the number of clusters.

suppressPackageStartupMessages(library(scran))
g <- buildSNNGraph(sce, k=2, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  45  31 264 122 292 356 124  30 421 159 372 360 134 152 245 151  47  63  29  45 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##  74  33  15  17   7  13  17  17  16  31  12   8   6  13   8   9  16   8  18  12 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   6   8   6   5   6   9  10   4   4   6   5   5   7   6   4   3   4   3   3   5 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75 
##   3   3   6   5   3   5   7   5   4   4   7   6   4   3   3
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k2value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k2clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=3, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
## 545 138 441 576 417  42 416 481 463 302  80  34   5   6  16   7
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k3value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k3clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=4, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 
## 556 330 147 377 627 339 477 475 425  72  84  31   7   6  16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k4value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k4clusters=nrow(unique(as.data.frame(clust)))


g <- buildSNNGraph(sce, k=5, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9  10 
## 561 587 859 441 424 422 428 168  63  16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k5value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k5clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=6, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9  10  11  12  13 
## 498 432 432 327 446 553 433 185 115 373  89  70  16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k6value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k6clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=7, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9 
## 503 561 587 880 429 435 410 148  16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k7value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k7clusters=nrow(unique(as.data.frame(clust)))


g <- buildSNNGraph(sce, k=8, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9 
## 502 562 587 918 397 445 425 117  16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k8value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k8clusters=nrow(unique(as.data.frame(clust)))


g <- buildSNNGraph(sce, k=9, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8 
## 542 502 562 919 587 435 406  16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k9value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k9clusters=nrow(unique(as.data.frame(clust)))


g <- buildSNNGraph(sce, k=12, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9 
## 558 501 856 588 421  21 429 424 171
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k12value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k12clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=13, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9 
## 558 501 837 589 476 440  21 418 129
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k13value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k13clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=15, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##   1   2   3   4   5   6   7   8   9 
## 835 558 502 588 454 429 184 399  20
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k15value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k15clusters=nrow(unique(as.data.frame(clust)))

g <- buildSNNGraph(sce, k=20, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
##    1    2    3    4    5    6 
## 1040  578  818  589  502  442
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k20value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k20clusters=nrow(unique(as.data.frame(clust)))

values=data.frame(k2value,k3value,k4value,k5value,k6value,k7value,k8value,k9value,k12value,k13value,k15value,k20value)
clusters=data.frame(k2clusters,k3clusters,k4clusters,k5clusters,k6clusters,k7clusters,k8clusters,k9clusters,k12clusters,k13clusters,k15clusters,k20clusters)
values=as.matrix(values)
clusters=as.matrix(clusters)
plot(clusters,values)

When the cluster numbers are changed in as in the plot above (from k=2 to k=20), the ####AdjustedRandIndex values are changed accordingly. As we can see, when the number of clusters ####reaches the biological/expected number of clusters, the AdjustedRandIndex value goes close to ####0.48-0.5.

The cell types that are difficult to separate by clustering are: “cd4.t.helper cells”" and “regulatory t.cells”.